Code
library(forecast)
library(tseries)
library(tidyverse)
library(tidymodels)
library(kableExtra)
library(modeltime)
library(prophet)
library(timetk)
library(Metrics)
library(zoo)
library(MLmetrics)
set.seed(123)Predicting COVID-19 Deaths (STAT 390)
library(forecast)
library(tseries)
library(tidyverse)
library(tidymodels)
library(kableExtra)
library(modeltime)
library(prophet)
library(timetk)
library(Metrics)
library(zoo)
library(MLmetrics)
set.seed(123)selected_death <- read_csv("data/finaldata_selectedfeatures.csv")selected_deaths <- selected_death %>%
rename(ds = date,
y = covid_19_deaths) %>%
select(-c(starts_with("deaths_"), "dayofweek", "quarter", "month", "dayofyear", "dayofmonth",
"weekofyear"))
# LOCF
selected_deaths <- fill(selected_deaths, everything())
selected_deaths <- replace(selected_deaths, is.na(selected_deaths), 100000000)
naniar::miss_var_summary(selected_deaths)# A tibble: 67 × 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 ds 0 0
2 y 0 0
3 administered_5plus 0 0
4 distributed_novavax 0 0
5 series_complete_moderna_65plus 0 0
6 administered_dose1_recip_18plus 0 0
7 additional_doses_12plus 0 0
8 administered_dose1_recip_65plus 0 0
9 administered_dose1_recip_12plus 0 0
10 administered_dose1_recip_18plus_pop_pct 0 0
# ℹ 57 more rows
selected_deaths <- selected_deaths %>%
mutate(y = case_when((y == 0) ~ 1,
.default = y))
# selected_deaths <- selected_deaths %>%
# mutate(y = log(y))#splits <- time_series_split(selected_deaths, initial = "148 weeks", assess = "17 weeks")
#train_data <- training(splits)
#test_data <- testing(splits)
train_data <- selected_deaths %>%
filter(year != 2023) %>%
select(-c(year))
test_data <- selected_deaths %>%
filter(year == 2023) %>%
select(-c(year))# Use default parameters to initiate Prophet Model
# Fit model on training set
baseline_model <- prophet(train_data)# Create time range for forecast
future_baseline <- make_future_dataframe(baseline_model, freq = "week", periods = 8)
# Make prediction
forecast_baseline <- predict(baseline_model, future_baseline)
# Visualize the forecast
plot(baseline_model, forecast_baseline) +
labs(title = "Baseline Prophet Model")prophet_plot_components(baseline_model, forecast_baseline)# Merge actual and predicted values
forecast_baseline1 <- predict(baseline_model, selected_deaths)
forecast_baseline2 <- forecast_baseline1 %>%
filter(ds > as.POSIXct("2023-01-01")) %>%
mutate(index = seq(1:332))
test_data <- test_data %>%
mutate(index = seq(1:332))
performance_baseline <- full_join(test_data, forecast_baseline2, by = join_by(ds == ds, index == index)) %>%
select(ds, y, yhat, index)
#
# performance_baseline <- performance_baseline %>%
# mutate(y = exp(y),
# yhat = exp(yhat))ggplot(performance_baseline) +
geom_line(aes(x = ds, y = y, color = "orange"), linewidth = 1) +
#geom_point(aes(x = index, y = y, color = "orange")) +
geom_line(aes(x = ds, y = yhat, color = "dodgerblue"), linewidth = 1) +
#geom_point(aes(x = index, y = yhat, color = "dodgerblue")) +
scale_color_manual(name="Value", values = c("orange", "dodgerblue"), labels = c("Predicted", "Actual")) +
labs(title = "Baseline Prophet Test Set",
y = "Deaths",
x = "Dates") +
theme_bw() +
theme(legend.title = element_blank())# Check MAE value
baseline_mae <- mae(performance_baseline$y, performance_baseline$yhat)
print(paste("The MAE for the baseline model is", baseline_mae))[1] "The MAE for the baseline model is 109.395757854377"
# Check MASE value
baseline_mase <- mase(performance_baseline$y, performance_baseline$yhat)
print(paste("The MASE for the baseline model is", baseline_mase))[1] "The MASE for the baseline model is 1.79097813086353"
plot(baseline_model, forecast_baseline) + add_changepoints_to_plot(baseline_model)# Zooming in
plot(baseline_model, forecast_baseline) + add_changepoints_to_plot(baseline_model) +
coord_fixed(ylim = c(0, 1000))# Create prophet model with CI of 95%
model_changepoint <- prophet(train_data, interval.width = 0.95, n.changepoints = 3)# Create time range for forecast
future_changepoint <- make_future_dataframe(model_changepoint, freq = "week", periods = 8)
# Make prediction
forecast_changepoint <- predict(model_changepoint, future_changepoint)
# Visualize the forecast
plot(model_changepoint, forecast_changepoint) +
labs(title = "Changepoint Prophet Model")# Visualize forecast components
prophet_plot_components(model_changepoint, forecast_changepoint)# Change points to plot
plot(model_changepoint, forecast_changepoint) + add_changepoints_to_plot(model_changepoint)# Merge actual and predicted values
forecast_changepoint2 <- forecast_changepoint %>%
filter(ds > as.POSIXct("2023-01-01"))
performance_changepoint <- full_join(test_data, forecast_changepoint2, by = join_by(ds == ds)) %>%
select(ds, y, yhat) %>%
mutate(yhat = case_when((yhat < 0) ~ 0,
.default = yhat))
# performance_changepoint <- performance_changepoint %>%
# mutate(y = exp(y),
# yhat = exp(yhat))
ggplot(performance_changepoint) +
geom_line(aes(x = ds, y = y, color = "orange"), linewidth = 1) +
#geom_point(aes(x = ds, y = y, color = "orange")) +
geom_line(aes(x = ds, y = yhat, color = "dodgerblue"), linewidth = 1) +
#geom_point(aes(x = ds, y = yhat, color = "dodgerblue")) +
scale_color_manual(name="Value", values = c("orange", "dodgerblue"), labels = c("Predicted", "Actual")) +
labs(title = "Changepoint Prophet Test Set",
y = "Deaths",
x = "Dates") +
theme_bw() +
theme(legend.title = element_blank())# Check MAE value
changepoint_mae <- mae(performance_changepoint$y, performance_changepoint$yhat)
print(paste("The MAE for the changepoint model is", changepoint_mae))[1] "The MAE for the changepoint model is 78.6442141376874"
# Check MASE value
changepoint_mase <- mase(performance_changepoint$y, performance_changepoint$yhat)
print(paste("The MASE for the changepoint model is", changepoint_mase))[1] "The MASE for the changepoint model is 1.28752769213446"
# Add seasonality and fit to training set
model_season <- prophet(train_data,
yearly.seasonality = FALSE,
weekly.seasonality = TRUE)# Create time range for forecast
future_season <- make_future_dataframe(model_season, freq = "week", periods = 8)
# Make prediction
forecast_season <- predict(model_season, future_season)
# Visualize the forecast
plot(model_season, forecast_season) +
labs(title = "Seasonal Prophet Model")prophet_plot_components(model_season, forecast_season)# Merge actual and predicted values
forecast_season2 <- forecast_season %>%
filter(ds > as.POSIXct("2023-01-01"))
performance_season <- full_join(test_data, forecast_season2, by = join_by(ds == ds)) %>%
select(ds, y, yhat)
# performance_season <- performance_season %>%
# mutate(y = exp(y),
# yhat = exp(yhat))
ggplot(performance_season) +
geom_line(aes(x = ds, y = y, color = "orange"), linewidth = 1) +
#geom_point(aes(x = ds, y = y, color = "orange")) +
geom_line(aes(x = ds, y = yhat, color = "dodgerblue"), linewidth = 1) +
#geom_point(aes(x = ds, y = yhat, color = "dodgerblue")) +
scale_color_manual(name="Value", values = c("orange", "dodgerblue"), labels = c("Predicted", "Actual")) +
labs(title = "Seasonality Prophet Test Set",
y = "Deaths",
x = "Dates") +
theme_bw() +
theme(legend.title = element_blank())# Check MAE value
season_mae <- mae(performance_season$y, performance_season$yhat)
print(paste("The MAE for the seasonality model is", season_mae))[1] "The MAE for the seasonality model is 46.7177816171926"
# Check MASE value
season_mase <- mase(performance_season$y, performance_season$yhat)
print(paste("The MASE for the seasonality model is", season_mase))[1] "The MASE for the seasonality model is 0.764842502487424"
train_data2 <- train_data %>%
select(-c(ds, y))
features <- colnames(train_data2)
# Add seasonality
model_multivariate <- prophet(train_data, fit = FALSE)
# Add all regressors
for (name in features){
model_multivariate <- add_regressor(model_multivariate, name)
}
# Fit on training set
multi_fit <- fit.prophet(model_multivariate, train_data)# Create time range for forecast
future_regressor <- make_future_dataframe(multi_fit, freq = "week", periods = 8)
# Append the regressor values
future_regressor <- full_join(future_regressor, train_data, by = join_by(ds == ds))
# Make prediction
forecast_regressor <- predict(multi_fit, selected_deaths)
# Visualize the forecast
plot(multi_fit, forecast_regressor) +
labs(title = "Regressor Prophet Model")prophet_plot_components(multi_fit, forecast_regressor)# Merge actual and predicted values
forecast_regressor2 <- forecast_regressor %>%
filter(ds > as.POSIXct("2023-01-01")) %>%
select(ds, yhat) %>%
mutate(index = seq(1:332))
test_data <- test_data %>%
mutate(index = seq(1:332))
performance_regressor <- full_join(test_data, forecast_regressor2, by = join_by(index == index, ds == ds)) %>%
select(ds, y, yhat, index)
performance_regressor <- performance_regressor %>%
mutate(yhat = case_when((yhat < 0) ~ 0,
.default = yhat))
# performance_regressor <- performance_regressor %>%
# mutate(y = exp(y),
# yhat = exp(yhat))
# Plotting Test
ggplot(performance_regressor) +
geom_line(aes(x = index, y = y, color = "orange")) +
# geom_point(aes(x = index, y = y, color = "orange")) +
geom_line(aes(x = index, y = yhat, color = "dodgerblue")) +
#geom_point(aes(x = index, y = yhat, color = "dodgerblue")) +
scale_color_manual(name="Value", values = c("orange", "dodgerblue"), labels = c("Predicted", "Actual")) +
labs(title = "Regressor Prophet Test Set",
y = "Deaths",
x = "Index") +
theme_bw() +
theme(legend.title = element_blank()) +
coord_fixed(ratio = 0.2)# Check MAE value
regressor_mae <- mae(performance_regressor$y, performance_regressor$yhat)
print(paste("The MAE for the regressor model is", regressor_mae))[1] "The MAE for the regressor model is 47.2132331706774"
# Check MASE value
regressor_mase <- mase(performance_regressor$y, performance_regressor$yhat)
print(paste("The MASE for the regressor model is", regressor_mase))[1] "The MASE for the regressor model is 0.77295381241934"
# Check MAPE value
regressor_mape <- MAPE(performance_regressor$yhat, performance_regressor$y)
print(paste("The MAPE for the regressor model is", regressor_mape))[1] "The MAPE for the regressor model is 0.880915064442601"
events <- tribble(
~holiday, ~ds, ~lower_window, ~upper_window,
#--|--|----
"COVID", as.Date('2020-03-14'), -15, 15,
"superbowl", as.Date('2020-02-02'), -7, 7,
"superbowl", as.Date('2021-02-07'), -7, 7,
"superbowl", as.Date('2022-02-13'), -7, 7,
"superbowl", as.Date('2023-02-12'), -7, 7,
)
events# A tibble: 5 × 4
holiday ds lower_window upper_window
<chr> <date> <dbl> <dbl>
1 COVID 2020-03-14 -15 15
2 superbowl 2020-02-02 -7 7
3 superbowl 2021-02-07 -7 7
4 superbowl 2022-02-13 -7 7
5 superbowl 2023-02-12 -7 7
# Add holidays
model_holiday <- prophet(train_data, holidays = events, fit = FALSE)
# Add built-in country-specific holidays
model_holiday <- add_country_holidays(model_holiday, country_name = 'US')
# Fit model on training
holiday_fit <- fit.prophet(model_holiday, train_data)# Create time range for forecast
future_holiday <- make_future_dataframe(holiday_fit, freq = "week", periods = 8)
# Append the regressor values
future_holiday <- left_join(future_holiday, test_data, by = join_by(ds == ds))
# Make prediction
forecast_holiday <- predict(holiday_fit, future_holiday)
# Visualize the forecast
plot(holiday_fit, forecast_holiday) +
labs(title = "Holiday Prophet Model")# Visualize the forecast components
prophet_plot_components(holiday_fit, forecast_holiday)# Merge actual and predicted values
forecast_holiday2 <- forecast_holiday %>%
filter(ds > as.POSIXct("2023-01-01")) %>%
select(ds, yhat) %>%
mutate(index = seq(1:332))
test_data <- test_data %>%
mutate(index = seq(1:332))
performance_holiday <- full_join(test_data, forecast_holiday2, by = join_by(ds == ds, index == index)) %>%
select(ds, y, yhat)
# performance_holiday <- performance_holiday %>%
# mutate(y = exp(y),
# yhat = exp(yhat))ggplot(performance_holiday) +
geom_line(aes(x = ds, y = y, color = "orange"), linewidth = 1) +
#eom_point(aes(x = ds, y = y, color = "orange")) +
geom_line(aes(x = ds, y = yhat, color = "dodgerblue"), linewidth = 1) +
#geom_point(aes(x = ds, y = yhat, color = "dodgerblue")) +
scale_color_manual(name="Value", values = c("orange", "dodgerblue"), labels = c("Predicted", "Actual")) +
labs(title = "Holiday Prophet Test Set",
y = "Deaths",
x = "Dates") +
theme_bw() +
theme(legend.title = element_blank())# Check MAE value
holiday_mae <- mae(performance_holiday$y, performance_holiday$yhat)
print(paste("The MAE for the holiday model is", holiday_mae))[1] "The MAE for the holiday model is 102.366540144566"
# Check MASE value
holiday_mase <- mase(performance_holiday$y, performance_holiday$yhat)
print(paste("The MASE for the holiday model is", holiday_mase))[1] "The MASE for the holiday model is 1.67589894093636"
# Check MAPE value
# holiday_mape <- MAPE(performance_holiday$yhat, performance_holiday$y)
# print(paste("The MAPE for the holiday model is", holiday_mape))# logging values
train_data <- train_data %>%
mutate(y = case_when((y == 0) ~ 1,
.default = y)) %>%
mutate(y = log(y))
test_data <- test_data %>%
mutate(y = log(y))
# Adding seasonality, holidays, changepoints
best_prophet <- prophet(train_data,
yearly.seasonality = TRUE,
weekly.seasonality = TRUE,
holidays = events,
n.changepoints = 3,
fit = FALSE)
best_prophet <- add_country_holidays(best_prophet, country_name = 'US')
# Adding features
for (name in features){
best_prophet <- add_regressor(best_prophet, name)
}
# Fit model
best_fit <- fit.prophet(best_prophet, train_data)# Create time range for forecast
future_best <- make_future_dataframe(best_fit, freq = "week", periods = 8)
# Append the regressor values
future_best <- left_join(future_best, train_data, by = join_by(ds == ds))
# Make prediction
forecast_best <- predict(best_fit, selected_deaths)# unlog
forecast_best1 <- forecast_best %>%
mutate(yhat = exp(yhat))
# Visualize the forecast
plot(best_fit, forecast_best) +
labs(title = "Best Prophet Model")forecast_best2 <- forecast_best %>%
mutate(yhat = exp(yhat),
ds = as.Date(ds)) %>%
select(ds, yhat) %>%
mutate(index = seq(1:7306))
selected_deaths2 <- selected_deaths %>%
select(ds, y) %>%
mutate(index = seq(1:7306))
best_predictions <- full_join(forecast_best2, selected_deaths2, join_by(ds == ds, index == index))
# save(best_predictions, file = "results/best_multi_prophet_pred.rda")
best_predictions2 <- best_predictions %>%
filter(ds >= as.Date("2023-01-01"))
ggplot() +
geom_line(data = best_predictions2, aes(x = index, y = y, color = "orange"), alpha = 0.55) +
geom_line(data = best_predictions2, aes(x = index, y = yhat, color = "dodgerblue"), alpha = 0.85) +
scale_color_manual(name="Value", values = c("orange", "dodgerblue"), labels = c("Predicted", "Actual")) +
scale_x_continuous(breaks = c(7000, 7150, 7300),
labels = c("Jan 2023", "Feb 2023", "March 2023")) +
theme_bw() +
theme(legend.title = element_blank()) +
labs(x = NULL,
y = NULL) +
theme(aspect.ratio=0.2) # Merge actual and predicted values
forecast_best2 <- forecast_best %>%
filter(ds > as.POSIXct("2023-01-01")) %>%
select(ds, yhat) %>%
mutate(index = seq(1:332))
test_data <- test_data %>%
mutate(index = seq(1:332))
performance_best <- full_join(test_data, forecast_best2, by = join_by(index == index, ds == ds)) %>%
select(ds, y, yhat, index)
performance_best <- performance_best %>%
mutate(y = exp(y),
yhat = exp(yhat))best_test_plot <- ggplot(performance_best) +
geom_line(aes(x = index, y = y, color = "orange")) +
#geom_point(aes(x = index, y = y, color = "orange")) +
geom_line(aes(x = index, y = yhat, color = "dodgerblue")) +
#geom_point(aes(x = index, y = yhat, color = "dodgerblue")) +
scale_color_manual(name="Value", values = c("orange", "dodgerblue"), labels = c("Predicted", "Actual")) +
scale_x_continuous(breaks = c(75, 185, 300),
labels = c("Jan 2023", "Feb 2023", "March 2023")) +
labs(title = "Best Prophet Test Set",
y = "Deaths",
x = "Dates") +
coord_fixed(ratio = 0.2) +
theme(legend.title = element_blank()) +
theme_bw()
best_test_plot# Check MAE value
best_mae <- mae(performance_best$y, performance_best$yhat)
print(paste("The MAE for the best model is", best_mae))[1] "The MAE for the best model is 30.8251140866864"
# Check MASE value
best_mase <- mase(performance_best$y, performance_best$yhat)
print(paste("The MASE for the best model is", best_mase))[1] "The MASE for the best model is 0.504654899727629"
# Check MAPE value
best_mape <- MAPE(performance_best$yhat, performance_best$y)
print(paste("The MAPE for the best model is", best_mape))[1] "The MAPE for the best model is 0.440957849425632"
# Printing All of them together
results <- tribble(
~model, ~mae, ~mase,
#--|--|----
"baseline", baseline_mae, baseline_mase,
"changepoint", changepoint_mae, changepoint_mase,
"holiday", holiday_mae, holiday_mase,
"regressors", regressor_mae, regressor_mase,
"season", season_mae, season_mase,
"best", best_mae, best_mase
) %>%
kbl() %>%
kable_styling()
results| model | mae | mase |
|---|---|---|
| baseline | 109.39576 | 1.7909781 |
| changepoint | 78.64421 | 1.2875277 |
| holiday | 102.36654 | 1.6758989 |
| regressors | 47.21323 | 0.7729538 |
| season | 46.71778 | 0.7648425 |
| best | 30.82511 | 0.5046549 |
# save best model
save(results, best_fit, performance_best, forecast_best, best_mase, best_mae,
file = "results/multi_prophet.rda")